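; void ondas_asm (
; 	unsigned char *src,	rdi
; 	unsigned char *dst,	rsi
; 	int m,			rdx	rows (image height)
; 	int n,			rcx	columns (image width)
; 	int row_size		r8
; );

; Additional arguments read by the routine (6th and 7th, System V AMD64):
;	int x_0			r9
;	int y_0			[rbp + 16]	(first stack argument, after push rbp)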

global ondas_asm

section .data
	; All constants are single-precision floats. Scalars are broadcast to
	; four lanes by the code (load + shufps 0) before use.

	; Wave-shape parameters used by profundidad.
	WAVELENGTH:	dd 64.0		; divisor applied to (distance - RADIUS)
	RADIUS:		dd 35.0		; subtracted from the distance to the centre
	PI:		dd 3.1415	; used to build the phase k*2*PI - PI
	TRAINWIDTH:	dd 3.4		; divisor in the 1/(1 + (r/TRAINWIDTH)^2) term

	; Per-lane column offsets added to the broadcast column index
	; (read as one 16-byte vector with an unaligned load).
	INDICES:	dd 0.0, 1.0, 2.0, 3.0

	; Numeric helpers.
	UNOS:		dd 1.0
	SEIS:		dd 6.0		; 3!, Taylor denominator
	CIENTOVEINTE:	dd 120.0	; 5!, Taylor denominator
	MUCHO:		dd 5040.0	; 7!, Taylor denominator
	SESENTAYCUATRO:	dd 64.0		; scale applied to the depth before adding to the pixel
	UN_MEDIO:	dd 0.5		; bias used when deriving floor(r)

section .text

;-----------------------------------------------------------------------
; sin_taylor: 7th-degree Taylor polynomial of sin on four packed floats.
; Private register convention (argument and result both in xmm2).
; In:    xmm2 = x            (4 x float)
; Out:   xmm2 = x - x^3/6 + x^5/120 - x^7/5040
; Clobb: xmm4, xmm5, xmm6, xmm7
;-----------------------------------------------------------------------
sin_taylor:
	; Build the odd powers of x. All of them are derived before xmm2
	; is modified, so every term uses the original argument.
	movaps	xmm7, xmm2
	mulps	xmm7, xmm2		; xmm7 = x^2
	movaps	xmm6, xmm7
	mulps	xmm6, xmm6		; xmm6 = x^4
	movaps	xmm5, xmm6
	mulps	xmm5, xmm7		; xmm5 = x^6
	mulps	xmm5, xmm2		; xmm5 = x^7
	mulps	xmm6, xmm2		; xmm6 = x^5
	mulps	xmm7, xmm2		; xmm7 = x^3

	; Accumulate the series term by term, alternating signs.
	movss	xmm4, [SEIS]
	shufps	xmm4, xmm4, 0h		; xmm4 = 6 6 6 6
	divps	xmm7, xmm4		; x^3 / 6
	subps	xmm2, xmm7		; x - x^3/6

	movss	xmm4, [CIENTOVEINTE]
	shufps	xmm4, xmm4, 0h		; xmm4 = 120 (all lanes)
	divps	xmm6, xmm4		; x^5 / 120
	addps	xmm2, xmm6		; ... + x^5/120

	movss	xmm4, [MUCHO]
	shufps	xmm4, xmm4, 0h		; xmm4 = 5040 (all lanes)
	divps	xmm5, xmm4		; x^7 / 5040
	subps	xmm2, xmm5		; ... - x^7/5040
	ret


;-----------------------------------------------------------------------
; profundidad: wave depth for four horizontally adjacent pixels.
; Private register convention.
; In:    r11d = column index of the first pixel (lanes get +0,+1,+2,+3)
;        r10d = row index (same for all four lanes)
;        xmm1 = packed float subtracted from the column coordinates
;        xmm0 = packed float subtracted from the row coordinate
; Out:   xmm2 = a * sin_taylor(k*2*PI - PI), where
;               d = sqrt(dx^2 + dy^2)
;               r = (d - RADIUS) / WAVELENGTH
;               k = r - floor(r)
;               a = 1 / (1 + (r/TRAINWIDTH)^2)
; Clobb: xmm3 (holds a on return), xmm4, xmm5, xmm6, xmm7
; Keeps: xmm0, xmm1 and all general-purpose registers
;-----------------------------------------------------------------------
profundidad:
	; Pixel coordinates as packed floats.
	movd	xmm4, r10d
	movd	xmm2, r11d
	shufps	xmm4, xmm4, 0h		; xmm4 = row in every lane
	shufps	xmm2, xmm2, 0h		; xmm2 = column in every lane
	cvtdq2ps xmm4, xmm4
	cvtdq2ps xmm2, xmm2
	movups	xmm3, [INDICES]
	addps	xmm2, xmm3		; xmm2 = col+3 col+2 col+1 col

	; Euclidean distance to the centre.
	subps	xmm4, xmm0		; dy
	subps	xmm2, xmm1		; dx
	mulps	xmm4, xmm4
	mulps	xmm2, xmm2
	addps	xmm2, xmm4		; dx^2 + dy^2
	sqrtps	xmm2, xmm2		; d

	; r = (d - RADIUS) / WAVELENGTH
	movss	xmm3, [RADIUS]
	shufps	xmm3, xmm3, 0h
	subps	xmm2, xmm3
	movss	xmm3, [WAVELENGTH]
	shufps	xmm3, xmm3, 0h
	divps	xmm2, xmm3		; xmm2 = r
	movaps	xmm4, xmm2		; xmm4 = r (kept for the attenuation)

	; k = r - floor(r). floor(r) is obtained as round(2r - 0.5) >> 1
	; (arithmetic shift), which relies on cvtps2dq rounding to nearest,
	; the default MXCSR mode.
	addps	xmm2, xmm2		; 2r
	movss	xmm5, [UN_MEDIO]
	shufps	xmm5, xmm5, 0h
	subps	xmm2, xmm5		; 2r - 0.5
	cvtps2dq xmm5, xmm2
	psrad	xmm5, 1
	cvtdq2ps xmm5, xmm5		; xmm5 = floor(r)
	movaps	xmm2, xmm4
	subps	xmm2, xmm5		; xmm2 = k

	; Phase t = k*2*PI - PI.
	movss	xmm6, [UNOS]
	shufps	xmm6, xmm6, 0h		; xmm6 = 1 1 1 1
	movaps	xmm5, xmm6
	addps	xmm5, xmm5		; xmm5 = 2 2 2 2
	mulps	xmm2, xmm5		; k*2
	movss	xmm5, [PI]
	shufps	xmm5, xmm5, 0h
	mulps	xmm2, xmm5		; k*2*PI
	subps	xmm2, xmm5		; k*2*PI - PI

	; Attenuation a = 1 / (1 + (r/TRAINWIDTH)^2), kept in xmm3 because
	; sin_taylor only clobbers xmm4-xmm7.
	movss	xmm5, [TRAINWIDTH]
	shufps	xmm5, xmm5, 0h
	divps	xmm4, xmm5		; r / TRAINWIDTH
	mulps	xmm4, xmm4		; (r / TRAINWIDTH)^2
	addps	xmm4, xmm6		; 1 + (r / TRAINWIDTH)^2
	movaps	xmm3, xmm6
	divps	xmm3, xmm4		; xmm3 = a

	call	sin_taylor		; xmm2 = sin polynomial of the phase

	mulps	xmm2, xmm3		; xmm2 = a * sin_taylor(t)
	ret

;-----------------------------------------------------------------------
; void ondas_asm(unsigned char *src, unsigned char *dst, int m, int n,
;                int row_size, int x_0, int y_0)
; ABI:   System V AMD64
; In:    rdi = src, rsi = dst
;        edx = m (rows), ecx = n (columns)
;        r8d = row_size (added to src and dst after every row)
;        r9d = x_0, dword [rbp + 16] = y_0 (first stack argument)
; Out:   for every pixel, dst = saturate_u8(src + round(64 * depth)),
;        where depth comes from profundidad.
; Clobb: rax, rcx, rdx, rdi, rsi, r8, r10, r11, xmm0-xmm7, flags
;        (all caller-saved; no callee-saved register is touched)
; Notes: returns without touching memory when m <= 0 or n <= 0.
;        Only bytes [0, n) of each row are read and written.
;-----------------------------------------------------------------------
ondas_asm:
	push	rbp			; frame pointer; also 16-aligns rsp for the calls below
	mov	rbp, rsp

	; The int arguments only define the low 32 bits of their registers,
	; so widen them before doing 64-bit compares and pointer arithmetic.
	movsxd	rdx, edx		; rdx = m
	movsxd	rcx, ecx		; rcx = n
	movsxd	r8, r8d			; r8  = row_size

	test	rdx, rdx
	jle	.done			; no rows
	test	rcx, rcx
	jle	.done			; no columns

	; profundidad subtracts xmm1 from the column coordinates and xmm0
	; from the row coordinate, so x_0 goes to xmm1 and y_0 to xmm0.
	movd	xmm1, r9d
	movd	xmm0, dword [rbp + 16]
	shufps	xmm1, xmm1, 0h		; xmm1 = x_0 x_0 x_0 x_0
	shufps	xmm0, xmm0, 0h		; xmm0 = y_0 y_0 y_0 y_0
	cvtdq2ps xmm1, xmm1
	cvtdq2ps xmm0, xmm0

	xor	r10d, r10d		; r10 = current row

.next_row:
	xor	r11d, r11d		; r11 = current column

	; Four pixels per iteration while a full group fits in the row.
.four_pixels:
	lea	rax, [r11 + 4]
	cmp	rax, rcx
	ja	.tail			; fewer than 4 columns left

	call	profundidad		; xmm2 = depth for columns r11..r11+3
	movss	xmm3, [SESENTAYCUATRO]
	shufps	xmm3, xmm3, 0h
	mulps	xmm2, xmm3		; depth * 64
	cvtps2dq xmm2, xmm2		; to int32 (current rounding mode)

	movd	xmm3, dword [rdi + r11]	; 4 source bytes
	pxor	xmm4, xmm4
	punpcklbw xmm3, xmm4		; bytes  -> words
	punpcklwd xmm3, xmm4		; words  -> dwords
	paddd	xmm2, xmm3		; src + depth * 64
	packssdw xmm2, xmm2		; dwords -> words, signed saturation
	packuswb xmm2, xmm2		; words  -> bytes, clamps to [0, 255]
	movd	dword [rsi + r11], xmm2

	add	r11, 4
	jmp	.four_pixels

	; Remaining 0-3 pixels, one at a time. Only lane 0 of the depth
	; vector is used, and exactly one byte is read and written.
.tail:
	cmp	r11, rcx
	jae	.row_done

	call	profundidad		; lane 0 = depth at column r11
	movss	xmm3, [SESENTAYCUATRO]	; upper lanes are zero
	mulps	xmm2, xmm3		; lane 0 = depth * 64
	cvtps2dq xmm2, xmm2

	movzx	eax, byte [rdi + r11]	; single source byte, zero-extended
	movd	xmm3, eax
	paddd	xmm2, xmm3
	packssdw xmm2, xmm2
	packuswb xmm2, xmm2		; clamp to [0, 255]
	movd	eax, xmm2
	mov	byte [rsi + r11], al

	inc	r11
	jmp	.tail

.row_done:
	add	rdi, r8			; advance both images by one row
	add	rsi, r8
	inc	r10
	cmp	r10, rdx
	jb	.next_row

.done:
	pop	rbp
	ret
